Include instructions for installing specific version of musicatk
If you are attempting to reproduce this analysis, you can 1) install/load renv, 2) copy/paste the renv.lock file into your current working directory and 3) run the renv::restore() command to automatically install all of the packages with the same version.
# Make sure the 'renv.lock' file is in your current working directory.
install.packages("renv")
library(renv)
renv::restore()
library(musicatk)
library("TCGAbiolinks")
library("withr")
tcga_datasets <- c("TCGA-LAML", "TCGA-ACC", "TCGA-BLCA", "TCGA-LGG", "TCGA-BRCA", "TCGA-CESC", "TCGA-CHOL", "TCGA-COAD", "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KICH", "TCGA-KIRC", "TCGA-KIRP", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", "TCGA-DLBC", "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-PCPG", "TCGA-PRAD", "TCGA-READ", "TCGA-SARC", "TCGA-SKCM", "TCGA-STAD", "TCGA-TGCT", "TCGA-THYM", "TCGA-THCA", "TCGA-UCS", "TCGA-UCEC", "TCGA-UVM")
types <- substring(tcga_datasets, 6)
dataset.list <- list()
annot <- list()
for(i in seq_along(tcga_datasets)) {
query <- GDCquery(project = tcga_datasets[i],
data.category = "Simple Nucleotide Variation",
data.type = "Masked Somatic Mutation",
workflow.type = "MuTect2 Variant Aggregation and Masking",
experimental.strategy = "WXS",
data.format = "maf")
GDCdownload(query)
data <- GDCprepare(query)
dataset.list[[types[i]]] <- data.frame(extract_variants_from_matrix(data), tumor_type=types[i], stringsAsFactors = FALSE)
}
select_tumor_types <- c("COAD", "LUAD", "SKCM", "UCEC")
dataset.select <- do.call(rbind, dataset.list[select_tumor_types])
annot.select <- unique(dataset.select[,c("sample", "tumor_type")])
g <- select_genome("38")
musica <- create_musica(dataset.select, g)
## Checking that chromosomes in the 'variant' object match chromosomes in the 'genome' object.
## Checking that the reference bases in the 'variant' object match the reference bases in the 'genome' object.
## Standardizing INS/DEL style
## Converting adjacent SBS into DBS
## 16044 SBS converted to DBS
matched_annot <- annot.select[match(samp_annot(musica)$Samples, annot.select[,1]), 2]
samp_annot(musica, "Tumor_Types") <- matched_annot
build_standard_table(musica, g, "SBS96")
## Building count table from SBS with SBS96 schema
res = discover_signatures(musica = musica, table_name = "SBS96",
num_signatures = 4, algorithm = "lda",
seed = 12345, nstart = 1)
# Plot exposures
plot_exposures(res, proportional = FALSE, sort_samples = "total",
num_samples = 150)
plot_exposures(res, proportional = TRUE, sort_samples = "Signature3")
plot_exposures(res, proportional = TRUE, num_samples = 8, sort_samples = "name")
# Plot signatures
plot_signatures(res)
# Plot exposures by signature
plot_exposures(result = res, proportional = TRUE, sort_samples = "total",
plot_type = "box", group_by = "signature",
annotation = "Tumor_Types", color_by = "annotation")
# Plot exposures by tumor type
plot_exposures(result = res, proportional = TRUE, sort_samples = "Signature4",
plot_type = "bar", group_by = "annotation",
annotation = "Tumor_Types", color_by = "signature")
with_seed(1, create_umap(res))
plot_umap(result = res, color_by = "annotation", annotation = "Tumor_Types")
plot_umap(res, "signatures")
plot_heatmap(res_annot = res, proportional = TRUE, scale = TRUE,
annotation = "Tumor_Types")
compare_cosmic_v2(res, threshold = 0.88)
## cosine x_sig_index y_sig_index x_sig_name y_sig_name
## 4 0.9966914 4 10 Signature4 Signature10
## 1 0.9951916 1 7 Signature1 Signature7
## 2 0.9766938 2 6 Signature2 Signature6
## 3 0.8843553 3 4 Signature3 Signature4
clust <- cluster_exposure(result = res, nclust = 4, iter.max = 100, tol = 1e-14)
## Metric: 'euclidean'; comparing: 1963 vectors.
plot_umap(result = res, color_by = "cluster", annotation = "Tumor_Types",
clust = clust)
glm_stats <- exposure_differential_analysis(musica_result = res,
annotation = "Tumor_Types",
method = "glm")
plot_differential_analysis(glm_stats, "glm", 4)
dataset <- do.call(rbind, dataset.list)
annot <- unique(dataset[,c("sample", "tumor_type")])
g <- select_genome("38")
tcga <- create_musica(dataset, g)
## Checking that chromosomes in the 'variant' object match chromosomes in the 'genome' object.
## Checking that the reference bases in the 'variant' object match the reference bases in the 'genome' object.
## Standardizing INS/DEL style
## Converting adjacent SBS into DBS
## 24594 SBS converted to DBS
matched_annot <- annot[match(samp_annot(tcga)$Samples, annot[,1]), 2]
samp_annot(tcga, "Tumor_Types") <- matched_annot
build_standard_table(tcga, g, "SBS96")
tcga_sbs_subset <- subset_musica_by_counts(tcga, "SBS96", 5)
tcga_v3_sbs <- with_seed(1, auto_predict_grid(musica = tcga_sbs_subset,
table_name = "SBS96", signature_res = cosmic_v3_sbs_sigs_exome,
algorithm = "lda", sample_annotation = "Tumor_Types"))
with_seed(1, create_umap(tcga_v3_sbs))
plot_umap(result = tcga_v3_sbs, proportional = TRUE, color_by = "annotation", annotation = "Tumor_Types", add_annotation_labels = TRUE, annotation_text_box = TRUE, annotation_label_size = 6, legend = FALSE, strip_axes = TRUE)
plot_umap(tcga_v3_sbs, same_scale = FALSE)
build_standard_table(tcga, g, "DBS78")
tcga_dbs_subset <- subset_musica_by_counts(tcga, "DBS78", 5)
tcga_v3_dbs <- with_seed(1, auto_predict_grid(musica = tcga_dbs_subset,
table_name = "DBS78", signature_res = cosmic_v3_dbs_sigs,
algorithm = "lda", sample_annotation = "Tumor_Types"))
with_seed(1, create_umap(tcga_v3_dbs))
plot_umap(result = tcga_v3_dbs, proportional = TRUE, color_by = "annotation", annotation = "Tumor_Types", add_annotation_labels = TRUE, annotation_text_box = TRUE, annotation_label_size = 6, legend = FALSE, strip_axes = TRUE)
plot_umap(tcga_v3_dbs, same_scale = FALSE)
build_standard_table(tcga, g, "IND83")
tcga_ind_subset <- subset_musica_by_counts(musica = tcga, table_name = "IND83", num_counts = 5)
tcga_v3_ind <- with_seed(1, auto_predict_grid(musica = tcga_ind_subset,
table_name = "IND83", signature_res = cosmic_v3_indel_sigs,
algorithm = "lda", sample_annotation = "Tumor_Types"))
with_seed(1, create_umap(tcga_v3_ind))
plot_umap(result = tcga_v3_ind, proportional = TRUE, color_by = "annotation", annotation = "Tumor_Types", add_annotation_labels = TRUE, annotation_text_box = TRUE, annotation_label_size = 6, legend = FALSE, strip_axes = TRUE)
plot_umap(tcga_v3_ind, same_scale = FALSE, legend = FALSE, strip_axes = TRUE)
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.10_1/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] withr_2.4.1 TCGAbiolinks_2.18.0 musicatk_1.1.1
## [4] NMF_0.23.0 Biobase_2.50.0 BiocGenerics_0.36.0
## [7] cluster_2.1.0 rngtools_1.5 pkgmaker_0.32.2
## [10] registry_0.5-1
##
## loaded via a namespace (and not attached):
## [1] circlize_0.4.12
## [2] BiocFileCache_1.14.0
## [3] plyr_1.8.6
## [4] BiocParallel_1.24.1
## [5] topicmodels_0.2-12
## [6] GenomeInfoDb_1.26.2
## [7] ggplot2_3.3.3
## [8] gridBase_0.4-7
## [9] digest_0.6.27
## [10] foreach_1.5.1
## [11] htmltools_0.5.1.1
## [12] magrittr_2.0.1
## [13] memoise_2.0.0
## [14] BSgenome_1.58.0
## [15] tm_0.7-8
## [16] doParallel_1.0.16
## [17] ComplexHeatmap_2.6.2
## [18] Biostrings_2.58.0
## [19] readr_1.4.0
## [20] matrixStats_0.58.0
## [21] BSgenome.Hsapiens.UCSC.hg38_1.4.3
## [22] R.utils_2.10.1
## [23] askpass_1.1
## [24] prettyunits_1.1.1
## [25] colorspace_2.0-0
## [26] ggrepel_0.9.1
## [27] blob_1.2.1
## [28] rvest_0.3.6
## [29] rappdirs_0.3.3
## [30] xfun_0.21
## [31] dplyr_1.0.4
## [32] crayon_1.4.1
## [33] RCurl_1.98-1.2
## [34] jsonlite_1.7.2
## [35] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [36] iterators_1.0.13
## [37] glue_1.4.2
## [38] gtable_0.3.0
## [39] zlibbioc_1.36.0
## [40] XVector_0.30.0
## [41] GetoptLong_1.0.5
## [42] DelayedArray_0.16.1
## [43] TxDb.Hsapiens.UCSC.hg38.knownGene_3.10.0
## [44] shape_1.4.5
## [45] scales_1.1.1
## [46] DBI_1.1.1
## [47] Rcpp_1.0.6
## [48] xtable_1.8-4
## [49] progress_1.2.2
## [50] clue_0.3-58
## [51] bit_4.0.4
## [52] matrixTests_0.1.9
## [53] stats4_4.0.2
## [54] httr_1.4.2
## [55] RColorBrewer_1.1-2
## [56] ellipsis_0.3.1
## [57] modeltools_0.2-23
## [58] pkgconfig_2.0.3
## [59] XML_3.99-0.5
## [60] R.methodsS3_1.8.1
## [61] farver_2.0.3
## [62] uwot_0.1.10
## [63] dbplyr_2.1.0
## [64] tidyselect_1.1.0
## [65] labeling_0.4.2
## [66] rlang_0.4.10
## [67] reshape2_1.4.4
## [68] AnnotationDbi_1.52.0
## [69] munsell_0.5.0
## [70] tools_4.0.2
## [71] cachem_1.0.4
## [72] downloader_0.4
## [73] generics_0.1.0
## [74] RSQLite_2.2.3
## [75] evaluate_0.14
## [76] stringr_1.4.0
## [77] fastmap_1.1.0
## [78] yaml_2.2.1
## [79] knitr_1.31
## [80] bit64_4.0.5
## [81] purrr_0.3.4
## [82] slam_0.1-48
## [83] R.oo_1.24.0
## [84] xml2_1.3.2
## [85] biomaRt_2.46.3
## [86] compiler_4.0.2
## [87] curl_4.3
## [88] png_0.1-7
## [89] tibble_3.0.6
## [90] stringi_1.5.3
## [91] highr_0.8
## [92] TCGAbiolinksGUI.data_1.10.0
## [93] GenomicFeatures_1.42.1
## [94] RSpectra_0.16-0
## [95] lattice_0.20-41
## [96] Matrix_1.3-2
## [97] vctrs_0.3.6
## [98] pillar_1.4.7
## [99] lifecycle_1.0.0
## [100] combinat_0.0-8
## [101] GlobalOptions_0.1.2
## [102] RcppAnnoy_0.0.18
## [103] data.table_1.13.6
## [104] cowplot_1.1.1
## [105] bitops_1.0-6
## [106] rtracklayer_1.50.0
## [107] GenomicRanges_1.42.0
## [108] R6_2.5.0
## [109] gridExtra_2.3
## [110] IRanges_2.24.1
## [111] codetools_0.2-18
## [112] MCMCprecision_0.4.0
## [113] MASS_7.3-53
## [114] gtools_3.8.2
## [115] assertthat_0.2.1
## [116] philentropy_0.4.0
## [117] SummarizedExperiment_1.20.0
## [118] openssl_1.4.3
## [119] rjson_0.2.20
## [120] GenomicAlignments_1.26.0
## [121] Rsamtools_2.6.0
## [122] S4Vectors_0.28.1
## [123] GenomeInfoDbData_1.2.4
## [124] hms_1.0.0
## [125] grid_4.0.2
## [126] tidyr_1.1.2
## [127] rmarkdown_2.6
## [128] MatrixGenerics_1.2.1
## [129] Cairo_1.5-12.2
## [130] NLP_0.2-1